{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import operator\n",
    "from math import log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#生成数据函数\n",
    "def creatdataset():\n",
    "    dataset=[[1,1,'yes'],\n",
    "             [1,1,'yes'],\n",
    "             [1,0,'no'],\n",
    "             [0,1,'no'],\n",
    "             [0,1,'no']]\n",
    "    labels=[\"no surfacing\",\"flippers\"]\n",
    "    return dataset,labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] ['no surfacing', 'flippers']\n",
      "5\n",
      "2\n"
     ]
    }
   ],
   "source": [
    "dataset,labels=creatdataset()\n",
    "print dataset,labels\n",
    "print len(dataset)\n",
    "print len(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#创建计算香浓熵的函数\n",
    "def calcShannonEnt(dataset):\n",
    "    classcount={}\n",
    "    for featvec in dataset:\n",
    "        currentlabel = featvec[-1]\n",
    "        if currentlabel not in classcount.keys():\n",
    "            classcount[currentlabel]=0\n",
    "        classcount[currentlabel]+=1\n",
    "    ShannonEnt=0.0\n",
    "    for each in classcount:   #这里的each是classcount的键\n",
    "        p = classcount[each]/float(len(dataset))\n",
    "        ShannonEnt-=p*log(p,2)\n",
    "    return ShannonEnt            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.970950594455\n"
     ]
    }
   ],
   "source": [
    "#测试信息熵函数\n",
    "ShannonEnt=calcShannonEnt(dataset)\n",
    "print ShannonEnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#按条件划分数据集\n",
    "def splitDataset(dataset,axis,value):   #要划分的数据集 数据集的第几个特征 按特征值的值进行划分\n",
    "    retdataset=[]\n",
    "    for featvec in dataset:\n",
    "        if featvec[axis] == value:\n",
    "            #如果符合要求，将所有符合要求的数据提取出来并减去该特征值对应的数据列\n",
    "            reducedfeatvec = featvec[:axis]\n",
    "            reducedfeatvec.extend(featvec[axis+1:])\n",
    "            retdataset.append(reducedfeatvec)\n",
    "    return retdataset\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 'yes'], [1, 'yes'], [0, 'no']]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#测试划分数据集的效果\n",
    "retdataset=splitDataset(dataset,0,1)\n",
    "retdataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#选择最好的特征值进行数据分割\n",
    "#按不同的特征值和特征值的值进行分割然后计算信息熵。进行排序。\n",
    "def ChooseBestFeatureToSplit(dataset):\n",
    "    numfeatures=len(dataset[0])-1   #共有几个特征属性\n",
    "    #计算分割数据之前的信息熵\n",
    "    bestFeature = -1\n",
    "    bestinfogain=0.0\n",
    "    basenonEnt=calcShannonEnt(dataset)\n",
    "    for i in range(numfeatures):\n",
    "        newEntropy=0.0\n",
    "        #遍历所有行收集所有的取值然后去重\n",
    "        featlist=[example[i] for example in dataset]\n",
    "        uniquevals=set(featlist)  #去重\n",
    "        for value in uniquevals:\n",
    "            #按特征属性和属性值分割数据集\n",
    "            retmat=splitDataset(dataset,i,value)\n",
    "            #计算新数据集占总数据集的权重\n",
    "            prob=len(retmat)/float(len(dataset))\n",
    "            newEntropy+=prob*calcShannonEnt(retmat)\n",
    "        infogain=basenonEnt-newEntropy  #计算减少的信息熵\n",
    "        if infogain>bestinfogain:\n",
    "            bestinfogain=infogain\n",
    "            bestfeature = i\n",
    "    return bestfeature  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "#检验上函数的正确性\n",
    "dataset,labels=creatdataset()\n",
    "bestfeature=ChooseBestFeatureToSplit(dataset)\n",
    "print bestfeature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#当消耗完所有的属性时，数据集的标签依然没有统一，这时采用多数决定的方法。\n",
    "def majorityCnt(classlist):\n",
    "    classCount = {}\n",
    "    for each in classlist:\n",
    "        if each not in classCount:\n",
    "            classCount[each]=0\n",
    "        classCount[each]+=1\n",
    "    sortedClassCount=sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=True)\n",
    "    return sortedClassCount[0][0]\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def createTree(dataSet,labels):\n",
    "    classList = [example[-1] for example in dataSet]\n",
    "    if classList.count(classList[0]) == len(classList): \n",
    "        return classList[0]#stop splitting when all of the classes are equal\n",
    "    if len(dataSet[0]) == 1: #stop splitting when there are no more features in dataSet\n",
    "        return majorityCnt(classList)\n",
    "    bestFeat = ChooseBestFeatureToSplit(dataSet)\n",
    "    bestFeatLabel = labels[bestFeat]\n",
    "    myTree = {bestFeatLabel:{}}\n",
    "    del(labels[bestFeat])\n",
    "    featValues = [example[bestFeat] for example in dataSet]\n",
    "    uniqueVals = set(featValues)\n",
    "    for value in uniqueVals:\n",
    "        subLabels = labels[:]       #copy all of labels, so trees don't mess up existing labels\n",
    "        myTree[bestFeatLabel][value] = createTree(splitDataset(dataSet, bestFeat, value),subLabels)\n",
    "    return myTree          "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tree=createTree(dataset,labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}\n"
     ]
    }
   ],
   "source": [
    "print tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#创建分类器\n",
    "def classify(inputTree,featLabels,testVec):  #参数分别为以字典形式组织的决策树，特征列表和要分类的特征向量\n",
    "    firstStr = inputTree.keys()[0]\n",
    "    secondDict = inputTree[firstStr]\n",
    "    featIndex = featLabels.index(firstStr)\n",
    "    key = testVec[featIndex]\n",
    "    valueOfFeat = secondDict[key]\n",
    "    if isinstance(valueOfFeat, dict): \n",
    "        classLabel = classify(valueOfFeat, featLabels, testVec)\n",
    "    else: classLabel = valueOfFeat\n",
    "    return classLabel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]\n",
      "['no surfacing', 'flippers']\n"
     ]
    }
   ],
   "source": [
    "data,label=creatdataset()\n",
    "print data\n",
    "print label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no\n"
     ]
    }
   ],
   "source": [
    "classLabel=classify(tree,label,[1,0])\n",
    "print classLabel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#决策树的存储 持久化存储\n",
    "def storeTree(inputTree,filename):\n",
    "    import pickle\n",
    "    fw = open(filename,'w')\n",
    "    pickle.dump(inputTree,fw)\n",
    "    fw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def grabTree(filename):\n",
    "    import pickle\n",
    "    fr = open(filename)\n",
    "    return pickle.load(fr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#测试存取决策树的函数\n",
    "storeTree(tree,\"mytree.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grabTree(\"mytree.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#用决策树算法预测隐形眼睛的镜片类型\n",
    "f=open(\"lenses.txt\",\"r\")\n",
    "buf=f.readlines()\n",
    "lenses_list=[]\n",
    "for example in buf:\n",
    "    li=example.strip().split(\"\\t\")\n",
    "    lenses_list.append(li)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['young', 'myope', 'no', 'reduced', 'no lenses'], ['young', 'myope', 'no', 'normal', 'soft'], ['young', 'myope', 'yes', 'reduced', 'no lenses'], ['young', 'myope', 'yes', 'normal', 'hard'], ['young', 'hyper', 'no', 'reduced', 'no lenses'], ['young', 'hyper', 'no', 'normal', 'soft'], ['young', 'hyper', 'yes', 'reduced', 'no lenses'], ['young', 'hyper', 'yes', 'normal', 'hard'], ['pre', 'myope', 'no', 'reduced', 'no lenses'], ['pre', 'myope', 'no', 'normal', 'soft'], ['pre', 'myope', 'yes', 'reduced', 'no lenses'], ['pre', 'myope', 'yes', 'normal', 'hard'], ['pre', 'hyper', 'no', 'reduced', 'no lenses'], ['pre', 'hyper', 'no', 'normal', 'soft'], ['pre', 'hyper', 'yes', 'reduced', 'no lenses'], ['pre', 'hyper', 'yes', 'normal', 'no lenses'], ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'no', 'normal', 'no lenses'], ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'yes', 'normal', 'hard'], ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'no', 'normal', 'soft'], ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]\n"
     ]
    }
   ],
   "source": [
    "print lenses_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lab=[\"age\",\"prescript\",\"astigmatic\",\"tearRate\"]\n",
    "#创建决策树\n",
    "mytree=createTree(lenses_list,lab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'prescript': {'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}, 'myope': 'hard'}}, 'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}, 'young': 'soft'}}}}}}\n"
     ]
    }
   ],
   "source": [
    "print mytree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
